# This is code to validate and clean the raw data collected for the paper
# "A One Health framework for exploring the zoonotic web: a case study".

# Data collection by A. Vogl
# Code developed by A. Desvars-Larrive

################################ Requirements ################################

# Load packages
my_packages <- c("dplyr", "tidyr", "taxize", "stringr", "rentrez")

# Extract not installed packages
not_installed <- my_packages[!(my_packages %in% installed.packages()[ , "Package"])]  
# Install not installed packages
if(length(not_installed)) install.packages(not_installed) 


# Import libraries
library(dplyr)
library(tidyr)
library(stringr)
library(taxize)
library(rentrez)


# Set the NCBI API key for a single R session
## How to obtain an NCBI API key: https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/
## How to use the API key: https://cran.r-project.org/web/packages/rentrez/vignettes/rentrez_tutorial.html

# Register the NCBI API key with rentrez; the placeholder string must be
# replaced by a real key before running the NCBI queries below.
set_entrez_key("your key") # write your API key value
# Show the value of the ENTREZ_KEY environment variable to check the key in use
Sys.getenv("ENTREZ_KEY")


# Set working directory to file location
# The path is taken from rstudioapi's active document context, so this line
# expects the script to be open in RStudio.
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

# Import the raw data (first row holds the column names)
# NOTE(review): `encoding` only marks strings as latin1 or UTF-8 and does not
# re-encode the input; "latin-9" is presumably not a recognised value there --
# confirm whether `fileEncoding` was intended.
zoon1.df <- read.csv("data_zoonoses_austria4.csv", 
                     header = TRUE, encoding="latin-9")
# Clean the raw table:
#  - strip leading/trailing whitespace from every character column
#  - lower-case the publication type, the vector type and the host common name
zoon.df <- zoon1.df %>%
  mutate(across(where(is.character), str_trim)) %>%
  mutate(across(c(type_publication, type_vector, name_host), tolower))

################################ Check data quality ################################
# Print the sorted distinct values of each field for visual inspection
zoon.df$type_publication %>% unique() %>% sort()
zoon.df$year_published %>% unique() %>% sort()
zoon.df$type_study %>% unique() %>% sort()
zoon.df$location %>% unique() %>% sort()
zoon.df$year_data_collect %>% unique() %>% sort()
zoon.df$type_pathog %>% unique() %>% sort()
zoon.df$pathogen_1 %>% unique() %>% sort()
zoon.df$pathogen_2 %>% unique() %>% sort()
zoon.df$type_vector %>% unique() %>% sort()
zoon.df$sci_name_vector_1 %>% unique() %>% sort()
zoon.df$sci_name_vector_2 %>% unique() %>% sort()
zoon.df$prev_vector %>% unique() %>% sort()
zoon.df$type_host %>% unique() %>% sort()
zoon.df$name_host %>% unique() %>% sort()
zoon.df$sci_name_anim_host %>% unique() %>% sort()  # scientific name as written in the paper
zoon.df$prev_anim_host %>% unique() %>% sort()
zoon.df$type_env %>% unique() %>% sort()
zoon.df$prev_env %>% unique() %>% sort()
zoon.df$type_food_1 %>% unique() %>% sort()
zoon.df$type_food_2 %>% unique() %>% sort()
zoon.df$prev_food %>% unique() %>% sort()
zoon.df$imported %>% unique() %>% sort()
zoon.df$imported_country %>% unique() %>% sort()
zoon.df$emerging %>% unique() %>% sort()
zoon.df$profession_risk %>% unique() %>% sort()

#################### Add NCBI-resolved host names and taxonomy (order, Family) to the dataset ####################

# Build a long table with one row per pathogen name: a `pathogen_2` cell may
# hold several names separated by ", ", so the cell is split into a list column
# and then unnested into the new `pathogen` column.
# `unnest()` is a tidyr function; it is called with an explicit namespace so
# that it resolves regardless of which packages are attached.
zoon.df_2 <- zoon.df %>%
  dplyr::mutate(pathogen = strsplit(as.character(pathogen_2), ", ")) %>%
  tidyr::unnest(pathogen) %>%
  # trim again: splitting can leave stray whitespace around individual names
  dplyr::mutate(dplyr::across(where(is.character), stringr::str_trim))
## Check the table:
#View(zoon.df_2)

# Host-name columns for every record that names a host in at least one of the
# two fields (common name, or scientific name as written in the paper);
# records where both fields are missing are dropped
host_names.df <- zoon.df_2 %>%
  dplyr::select(name_host, sci_name_anim_host) %>%
  dplyr::filter(!is.na(name_host) | !is.na(sci_name_anim_host))

# Look up the NCBI scientific name for each distinct host common name
comm_to_sci_host <- comm2sci(com = unique(host_names.df$name_host), db = "ncbi")
# Choices made at the prompts during the lookup:
# Choose Rattus norvegicus
# Choose Macaca fuscata
# Choose Buteo buteo
# Choose Erithacus rubecula (European robin)
# Choose Pongo pygmaeus
## If a mistake is made in the choice, should be fixed later in the script


# Flatten the lookup result into a two-column table: the resolved scientific
# name and the common name that was queried (taken from the row names)
comm_to_sci_host.df <- setNames(as.data.frame(unlist(comm_to_sci_host)),
                                "ncbi_host_sci")
comm_to_sci_host.df[["name_host"]] <- row.names(comm_to_sci_host.df)

# Save the lookup table
write.csv(comm_to_sci_host.df, file = "comm_to_sci_host.df.csv", row.names = FALSE)

# Add NCBI-resolved scientific names to the dataset
zoon.df_1 <- left_join(zoon.df_2, comm_to_sci_host.df, by = "name_host")

# List the hosts whose scientific name in the paper differs from the NCBI
# scientific name resolved from the common name.
# The comparison is made row by row: testing membership in the whole NCBI
# column would hide a mismatch whenever the paper name happens to be the NCBI
# name of some other record. Rows where either name is missing are skipped.
zoon.df_1_diff <- zoon.df_1 %>%
  dplyr::filter(!is.na(sci_name_anim_host),
                !is.na(ncbi_host_sci),
                sci_name_anim_host != ncbi_host_sci) %>%
  dplyr::select(name_host, sci_name_anim_host, ncbi_host_sci) %>%
  distinct()
zoon.df_1_diff

# After manual check on the NCBI taxonomy website and information source,
# we keep Capra ibex instead of Capra ibex ibex because the subspecies does not have a common name in NCBI
# use the current taxonomic name for the Eurasian water vole: Arvicola terrestris --> Arvicola amphibius (current name)

# Records reported as Arvicola terrestris get the current name Arvicola
# amphibius; every other record keeps its NCBI-resolved name
zoon.df_2 <- zoon.df_1 %>%
  dplyr::mutate(
    ncbi_host_sci = if_else(sci_name_anim_host %in% "Arvicola terrestris",
                            "Arvicola amphibius",
                            ncbi_host_sci)
  )

# Complete NCBI-resolved scientific names using the scientific names provided in the publications
gnr_res_host_sci_name <- gnr_resolve(sci = unique(zoon.df$sci_name_anim_host),
                                     preferred_data_sources = "4")

# Join the resolved names onto the data.
# Only the (supplied name, matched name) pairs are joined, and identical pairs
# are collapsed first: several resolver hits returning the same matched name
# would otherwise duplicate observation rows in the dataset.
# NOTE(review): a supplied name with several *different* matched names still
# yields one row per match, as before -- check zoon.df_3 for such cases.
zoon.df_3 <- zoon.df_2 %>%
  left_join(dplyr::distinct(gnr_res_host_sci_name, user_supplied_name, matched_name),
            by = c("sci_name_anim_host" = "user_supplied_name"),
            relationship = "many-to-many") %>%
  # only fill in names that could not be resolved from the common name
  dplyr::mutate(ncbi_host_sci = if_else(is.na(ncbi_host_sci), matched_name, ncbi_host_sci))


# Hosts whose common name and scientific name (both taken from the
# publication) resolve to different NCBI scientific names
zoon.df_3_diff <- zoon.df_3 %>%
  dplyr::select(name_host, sci_name_anim_host, ncbi_host_sci, matched_name) %>%
  dplyr::filter(ncbi_host_sci != matched_name) %>%
  distinct()
zoon.df_3_diff

## check     
###  mouflon    Ovis orientalis Ovis aries musimon Ovis orientalis --> we should keep Ovis aries musimon
### deer     Cervus elaphus    Cervidae   Cervus elaphus --> we should keep  Cervus elaphus
### european herring gull   Larus argentatus Larus argentatus argenteus Larus argentatus --> we should keep Larus argentatus 

# For the herring gull and the red deer, use the name matched from the
# scientific name given in the paper; all other records keep their current
# NCBI name. The helper column `matched_name` is dropped afterwards.
zoon.df_4 <- zoon.df_3 %>%
  dplyr::mutate(
    ncbi_host_sci = if_else(
      sci_name_anim_host %in% c("Larus argentatus", "Cervus elaphus"),
      matched_name,
      ncbi_host_sci
    )
  ) %>%
  dplyr::select(-matched_name)


# Hosts that have a common name but still no NCBI-resolved scientific name
to_complete_hosts_1 <- zoon.df_4 %>%
  dplyr::filter(!is.na(name_host), is.na(ncbi_host_sci)) %>%
  dplyr::select(name_host, sci_name_anim_host, ncbi_host_sci) %>%
  distinct()
to_complete_hosts_1

#View(to_complete_hosts_1)

## The remaining names are completed by hand where possible (double check
## names with the information source and the NCBI taxonomy website).
### Import the table of manually inferred host names and trim whitespace in
### its character columns
inferred_names <- read.csv("inferred_hosts.csv", header = TRUE, encoding = "UTF-8") %>%
  mutate(across(where(is.character), str_trim))
inferred_names


# domesticated muscovy duck, Anas platyrhynchos domesticus --> the species
# cannot be inferred from the information source. However, the paper mentioned
# that this animal was "free ranging". We therefore decided to keep the
# scientific name Anas platyrhynchos, corresponding to the Mallard.
zoon.df_4.0 <- local({
  # manual corrections: scientific name as written in the paper -> name to use
  manual_sci_fixes <- c(
    "Ovis orientalis musimon"       = "Ovis aries musimon",
    "Anas platyrhynchos domesticus" = "Anas platyrhynchos",
    "Carduelis chloris"             = "Chloris chloris",
    "Delichon urbica"               = "Delichon urbicum",
    "Bombycilla garrulous"          = "Bombycilla garrulus",
    "Nyctea scandiaca"              = "Bubo scandiacus",
    "Sylvia communis"               = "Curruca communis"
  )

  zoon.df_4 %>%
    # apply a manual correction where one exists, otherwise keep the current name
    dplyr::mutate(
      ncbi_host_sci = coalesce(
        unname(manual_sci_fixes[as.character(sci_name_anim_host)]),
        ncbi_host_sci
      )
    ) %>%
    left_join(inferred_names, by = "name_host", relationship = "many-to-many") %>%
    # inferred names only fill the gaps left by the NCBI-resolved names
    mutate(ncbi_host_sci = coalesce(ncbi_host_sci, inferred_name_host)) %>%
    dplyr::select(-inferred_name_host) # remove inferred_name_host column
})

# Save the intermediate table and read it back from disk
write.csv(zoon.df_4.0, file = "zoon.df_4.0.csv", row.names = FALSE)
zoon.df_4.0 <- read.csv("zoon.df_4.0.csv", header = TRUE, encoding = "latin-9")

# Retrieve the NCBI common name of each distinct host scientific name
sci_to_comm_host <- sci2comm(sci = unique(zoon.df_4.0$ncbi_host_sci), db = "ncbi")

# Flatten the result into a two-column table: the common name and the
# scientific name that was queried (taken from the row names)
sci_to_comm_host.df <- setNames(as.data.frame(unlist(sci_to_comm_host)),
                                "ncbi_host_comm")
sci_to_comm_host.df[["ncbi_host_sci"]] <- row.names(sci_to_comm_host.df)

# Add NCBI-resolved common name to the dataset
zoon.df_4.1 <- left_join(zoon.df_4.0, sci_to_comm_host.df, by = "ncbi_host_sci")

# Save the intermediate table and read it back from disk
write.csv(zoon.df_4.1, file = "zoon.df_4.1.csv", row.names = FALSE)
zoon.df_4.1 <- read.csv("zoon.df_4.1.csv", header = TRUE, encoding = "latin-9")

# Hosts that have a common name in the paper but no NCBI common name yet
to_complete_hosts_2 <- zoon.df_4.1 %>%
  dplyr::filter(!is.na(name_host), is.na(ncbi_host_comm)) %>%
  dplyr::select(name_host, sci_name_anim_host, ncbi_host_sci, ncbi_host_comm) %>%
  distinct()
to_complete_hosts_2

# all NCBI_host_sci are completed in this list
# extraction from NCBI did not work for:
# new-world camelid (unspecified) --> not specific enough
# Salmo trutta fario --> river trout 
# Panurus biarmicus --> bearded reedling (not to be found in NCBI, I used Wikipedia)
# Amazona --> not specific enough 
# Phasianinae --> not specific enough 
# Dendrocopos major --> great spotted woodpecker
# Riparia riparia --> sand martin (not to be found in NCBI, I used Wikipedia)
# Acrocephalus --> Acrocephalus (not specific enough)
# Agapornis --> Agapornis (not specific enough)
# Ficedula hypoleuca --> European pied flycatcher
# Zosterops minor --> black-fronted white-eye (this is not so clear, I used the name from the paper)
# Motacilla citreola --> citrine wagtail (not to be found in NCBI, I used Wikipedia)
# Prunella modularis --> dunnock (not to be found in NCBI, I used Wikipedia)
# Periparus ater --> coal tit (not to be found in NCBI, I used Wikipedia)
# Euphonia violacea --> violaceous euphonia (not to be found in NCBI, I used Wikipedia)
# Vulpes --> Vulpes
# lizard (unspecified) --> lizard (unspecified)  (not specific enough)
# wild ungulate (unspecified) --> wild ungulate (unspecified)  (not specific enough)
# Aquila heliaca --> eastern imperial eagle (not to be found in NCBI, I used Wikipedia)
# Larus argentatus --> herring gull
# Athene noctua --> little owl
# lizard (unspecified) --> lizard (unspecified)  (not specific enough)
# Anserinae --> Anserinae  (not specific enough)
# Ciconia --> Ciconia  (not specific enough)
# Columba --> Columba  (not specific enough)
# Alectoris --> Alectoris  (not specific enough)
# Acrocephalus scirpaceus --> Eurasian reed warbler
# Loxia curvirostra --> red crossbill
# Larus--> Larus (not specific enough) 
# Bombycilla garrulus --> bohemian waxwing
# Scolopax rusticola --> Eurasian woodcock


# Fill in / override the host common name by hand.
# The table maps `ncbi_host_sci` to the common name to use; hosts that are not
# listed keep the common name retrieved from NCBI.
zoon.df_4.2 <- local({
  manual_common_names <- c(
    "new-world camelid (unspecified)" = "new-world camelid (unspecified)",
    "Salmo trutta fario"              = "river trout",
    "Panurus biarmicus"               = "bearded reedling",
    "Amazona"                         = "Amazona",
    "Phasianinae"                     = "Phasianinae",
    "Dendrocopos major"               = "great spotted woodpecker",
    "Riparia riparia"                 = "sand martin",
    "Acrocephalus"                    = "Acrocephalus",
    "Agapornis"                       = "Agapornis",
    "Ficedula hypoleuca"              = "European pied flycatcher",
    "Zosterops minor"                 = "black-fronted white-eye",
    "Motacilla citreola"              = "citrine wagtail",
    "Prunella modularis"              = "dunnock",
    "Periparus ater"                  = "coal tit",
    "Euphonia violacea"               = "violaceous euphonia",
    "Vulpes"                          = "Vulpes",
    "lizard (unspecified)"            = "lizard (unspecified)",
    "wild ungulate (unspecified)"     = "wild ungulate (unspecified)",
    "Aquila heliaca"                  = "eastern imperial eagle",
    "Larus argentatus"                = "herring gull",
    "Athene noctua"                   = "little owl",
    "Anserinae"                       = "goose",
    "Ciconia"                         = "Ciconia",
    "Columba"                         = "Columba",
    "Alectoris"                       = "Alectoris",
    "Acrocephalus scirpaceus"         = "Eurasian reed warbler",
    "Loxia curvirostra"               = "red crossbill",
    "Larus"                           = "Larus",
    "Bombycilla garrulus"             = "bohemian waxwing",
    "Scolopax rusticola"              = "Eurasian woodcock",
    "bird of prey (unspecified)"      = "raptor",
    "Meles meles"                     = "Eurasian badger",
    "Falco peregrinus"                = "peregrine falcon",
    "Ciconia nigra"                   = "black stork",
    "Tyto alba"                       = "barn owl",
    "Vulpes vulpes"                   = "red fox",
    "Numididae sp."                   = "guineafowl",
    "Bubo scandiacus"                 = "snowy owl",
    "Phylloscopus collybita"          = "eurasian chiffchaff",
    "Coraciiformes"                   = "kingfisher",
    "Falco cherrug"                   = "saker falcon",
    "Hirundo rustica"                 = "barn swallow",
    "Emberiza calandra"               = "corn bunting",
    "Nestor notabilis"                = "kea",
    "Falco rusticolus"                = "gyrfalcon"
  )

  zoon.df_4.1 %>%
    dplyr::mutate(
      # a listed host takes the manual name; an unlisted (or missing) host
      # looks up to NA and falls back to the existing common name
      ncbi_host_comm = coalesce(
        unname(manual_common_names[as.character(ncbi_host_sci)]),
        ncbi_host_comm
      )
    )
})
# Check again: hosts with a common name in the paper but no final common name
zoon.df_4.2 %>%
  dplyr::filter(!is.na(name_host), is.na(ncbi_host_comm)) %>%
  dplyr::select(name_host, sci_name_anim_host, ncbi_host_sci, ncbi_host_comm) %>%
  distinct()
# this table should be empty


########### Retrieve the taxonomic hierarchy for a given taxon - host ###########

# Query NCBI for the full lineage of each distinct host scientific name
classif_host <- classification(unique(zoon.df_4.2$ncbi_host_sci), db = "ncbi")

# Stack the per-taxon lineages into one long table and keep only the ranks of
# interest
classif_host.long <- rbind(classif_host) %>%
  dplyr::select(-id) %>%
  dplyr::filter(rank %in% c("class", "subclass", "order", "family", "genus"))

# One row per queried taxon, one column per rank.
# `pivot_wider()` replaces the superseded `spread()`; it is namespace-qualified
# so that it resolves regardless of which packages are attached.
classif_host.df <- classif_host.long %>%
  tidyr::pivot_wider(names_from = rank, values_from = name) %>%
  # use subclass if class is missing (only Testudinidae)
  mutate(class = coalesce(class, subclass)) %>%
  dplyr::select(query, class, order, family, genus)

# Hosts for which no order could be retrieved from the classification
classif_host.check <- dplyr::filter(classif_host.df, is.na(order))
classif_host.check

# Attach the taxonomy to the data and suffix the rank columns with "_host"
zoon.df_5 <- zoon.df_4.2 %>%
  left_join(classif_host.df, by = c("ncbi_host_sci" = "query")) %>%
  dplyr::rename(class_host = class,
                order_host = order,
                family_host = family,
                genus_host = genus)

# Assign a class by hand to the unspecified host groups; all other hosts keep
# the class obtained from the classification.
# Unspecified birds of prey are stored in `ncbi_host_sci` as
# "bird of prey (unspecified)" ("raptor" is the common name given to them when
# the common names are completed), so that label is matched here as well;
# "raptor" is still accepted for backward compatibility.
zoon.df_5.1 <- zoon.df_5 %>%
  dplyr::mutate(class_host = case_when(
    ncbi_host_sci == "new-world camelid (unspecified)" ~ "Mammalia",
    ncbi_host_sci == "lizard (unspecified)" ~ "Lepidosauria",
    ncbi_host_sci == "wild ungulate (unspecified)" ~ "Mammalia",
    ncbi_host_sci %in% c("raptor", "bird of prey (unspecified)") ~ "Aves",
    TRUE ~ class_host
  ))

# Save the intermediate table and read it back from disk
write.csv(zoon.df_5.1, "zoon.df_5.1.csv", row.names = FALSE)

zoon.df_5.1 <- read.csv("zoon.df_5.1.csv",
                        header = TRUE, encoding = "latin-9")

################################ Add NCBI-resolved pathogen names and taxonomy (order, Family) to the dataset ################################

# Sorted list of the distinct pathogen names
list_pathog_name <- zoon.df_2$pathogen %>% unique() %>% sort()
#list_pathog_name

# Resolve the pathogen names given in the publications against data source "4"
gnr_res_pathog <- gnr_resolve(sci = list_pathog_name, preferred_data_sources = "4")

# Keep a single match per supplied name: the first one among those with the
# highest score
gnr_res_pathog_subset <- gnr_res_pathog %>%
  dplyr::group_by(user_supplied_name) %>%
  dplyr::filter(score == max(score)) %>%
  slice_head(n = 1)

# Add the resolved pathogen name (`matched_name`) to the data.
# The names sent to the resolver come from the `pathogen` column, and the
# checks that follow compare `pathogen` with `matched_name`, so the join key is
# `pathogen` (joining on `pathogen_1` only matched where the two columns
# happened to hold the same string).
zoon.df_6 <- zoon.df_5.1 %>%
  left_join(gnr_res_pathog_subset, by = c("pathogen" = "user_supplied_name")) %>%
  dplyr::select(-c(submitted_name, data_source_title, score))

# Save the intermediate table and read it back from disk
write.csv(zoon.df_6, "zoon.df_6.csv", row.names = FALSE)

zoon.df_6 <- read.csv("zoon.df_6.csv",
                      header = TRUE, encoding = "latin-9")

# Pathogens whose name in the publication differs from the resolved name
zoon.df_6_diff <- zoon.df_6 %>%
  dplyr::select(pathogen, matched_name) %>%
  dplyr::filter(pathogen != matched_name) %>%
  distinct()
zoon.df_6_diff

# Pathogens for which no resolved name was found
zoon.df_6_diff_2 <- zoon.df_6 %>%
  dplyr::select(pathogen, matched_name) %>%
  dplyr::filter(is.na(matched_name)) %>%
  distinct()
zoon.df_6_diff_2


## Corrected manually using the NCBI Taxonomy webpage
### Some names cannot be replaced by more specific names.
### Some names should be kept as given in the source (e.g. Staph. aureus MRSA; E. coli STEC, VTEC, EHEC, ESBL)
zoon.df_7 <- zoon.df_6 %>%
  dplyr::mutate(matched_name = case_when(pathogen == "Dobrava–Belgrade virus" ~ "Dobrava-Belgrade orthohantavirus",
                                         pathogen == "Dobrava–Belgrade orthohantavirus" ~ "Dobrava-Belgrade orthohantavirus",
                                         pathogen == "Candidatus Neoehrlichia mikurensis" ~ "Neoehrlichia mikurensis",
                                         pathogen == "Babesia venatorum" ~ "Babesia sp. venatorum",
                                         pathogen == "Dirofilaria immitis" ~ "Dirofilaria immitis",
                                         pathogen == "Dirofilaria repens" ~ "Dirofilaria repens",
                                         pathogen == "Babesia microti" ~ "Babesia microti",
                                         pathogen == "Babesia capreoli" ~ "Babesia capreoli",
                                         pathogen == "Babesia divergens" ~ "Babesia divergens",
                                         pathogen == "Babesia" ~ "Babesia",
                                         pathogen == "Theileria" ~ "Theileria",
                                         pathogen == "Anaplasma" ~ "Anaplasma",
                                         pathogen == "Mycobacterium microti" ~ "Mycobacterium tuberculosis variant microti",
                                         pathogen == "Mycobacterium marinum" ~ "Mycobacterium marinum",
                                         pathogen == "Mycobacterium fortuitum" ~ "Mycolicibacterium fortuitum",
                                         pathogen == "Mycobacterium chelonae" ~ "Mycobacterium chelonae",
                                         pathogen == "Borrelia burgdorferi" ~ "Borreliella burgdorferi",
                                         pathogen == "Borrelia burgdorferi sensu lato" ~ "Borreliella burgdorferi",
                                         pathogen == "Borrelia burgdorferi sensu stricto" ~ "Borreliella burgdorferi",
                                         pathogen == "Salmonella Dublin" ~ "Salmonella enterica subsp. enterica serovar Dublin",
                                         pathogen == "Salmonella infantis" ~ "Salmonella enterica subsp. enterica serovar Infantis",
                                         pathogen == "Bat-associated Bartonella" ~ "Bartonella",
                                         pathogen == "Rickettsia raoultii" ~ "Rickettsia conorii subsp. raoultii",
                                         pathogen == "Rickettsia monoacensis" ~ "Rickettsia monacensis",
                                         pathogen == "Rickettsia helvetica" ~ "Rickettsia helvetica",
                                         pathogen == "Salmonella Enteritidis" ~ "Salmonella enterica subsp. enterica serovar Enteritidis",
                                         pathogen == "Salmonella Typhimurium" ~ "Salmonella enterica subsp. enterica serovar Typhimurium",
                                         pathogen == "Mycobacterium caprae" ~ "Mycobacterium tuberculosis variant caprae",
                                         pathogen == "Puumala hantavirus" ~ "Puumala orthohantavirus",
                                         pathogen == "Lymphocytic choriomeningitis virus" ~ "Lymphocytic choriomeningitis mammarenavirus",
                                         pathogen == "Clostridium difficile" ~ "Clostridioides difficile",
                                         pathogen == "Equine rhinitis B virus" ~ "Erbovirus A",
                                         pathogen == "H1N1" ~ "H1N1 subtype",
                                         pathogen == "H3N3" ~ "H3N3 subtype",
                                         pathogen == "Puumala hantavirus" ~ "Puumala orthohantavirus",
                                         pathogen == "Hantaan hantavirus" ~ "Clostridioides difficile",
                                         pathogen == "Chlamydophila psittaci" ~ "Chlamydia psittaci",
                                         pathogen == "Chlamydia psittaci" ~ "Chlamydia psittaci",
                                         pathogen == "Chlamydia abortus" ~ "Chlamydia abortus",
                                         pathogen == "Trichinella brivoti" ~ "Trichinella britovi",
                                         pathogen == "Enterococcus faecalis" ~ "Enterococcus faecalis",
                                         pathogen == "Parainfluenza virus type 3" ~ "Human respirovirus 3",
                                         pathogen == "Rochalimaea henselae" ~ "Bartonella henselae",
                                         pathogen == "Chlamydophila felis" ~ "Chlamydia felis",
                                         pathogen == "H1N2" ~ "H1N2 subtype",
                                         pathogen == "H1N3" ~ "H1N3 subtype",
                                         pathogen == "Leptospira grippotyphosa" ~ "Leptospira kirschneri serovar Grippotyphosa",
                                         pathogen == "Rabies lyssavirus" ~ "Lyssavirus rabies",
                                         pathogen == "Mycobacterium bovis" ~ "Mycobacterium tuberculosis variant bovis",
                                         pathogen == "West Nile virus lineage 2" ~ "West Nile virus",
                                         pathogen == "West Nile virus lineage 3" ~ "West Nile virus",
                                         pathogen == "Japanese Ecephalitis virus" ~ "Japanese encephalitis virus",
                                         pathogen == "Zika virus" ~ "Zika virus",
                                         pathogen == "Usutu virus" ~ "Usutu virus",
                                         pathogen == "Dengue virus" ~ "Dengue virus",
                                         pathogen == "Yellow Fever virus" ~ "Yellow fever virus",
                                         pathogen == "Hepatitis E virus" ~ "Orthohepevirus A",
                                         pathogen == "dog hair" ~ "dog hair",
                                         pathogen == "Equine rhinitis virus serotype 1" ~ "Equine rhinitis A virus",
                                         pathogen == "Coronavirus" ~ "Coronavirus",
                                         pathogen == "Hantavirus" ~ "Orthohantavirus",
                                         pathogen == "Ancylostoma caninum" ~ "Ancylostoma caninum", 
                                         pathogen == "Toxocara canis" ~ "Toxocara canis",
                                         pathogen == "Trichuris vulpis" ~ "Trichuris vulpis",
                                         pathogen == "Adenovirus" ~ "Adenovirus",
                                         pathogen == "Herpesvirus" ~ "Herpesvirus",
                                         pathogen == "Cheyletiella mite" ~ "Cheyletiella",
                                         pathogen == "Staphylococcus aureus MRSA" ~ "Staphylococcus aureus MRSA",
                                         pathogen == "Staphylococcus aureus" ~ "Staphylococcus aureus",
                                         pathogen == "Streptococcus dysgalacitae" ~ "Streptococcus dysgalactiae",
                                         pathogen == "Escherichia coli STEC" ~ "Escherichia coli STEC",
                                         pathogen == "Escherichia coli VTEC" ~ "Escherichia coli VTEC",
                                         pathogen == "Escherichia coli EHEC" ~ "Escherichia coli EHEC",
                                         pathogen == "Escherichia coli EPEC" ~ "Escherichia coli EPEC",
                                         pathogen == "Genus species ESBL producing Escherichia coli" ~ "Escherichia coli ESBL",
                                         pathogen == "Anaplasma phagocytophilum" ~ "Anaplasma phagocytophilum",
                                         pathogen == "Ehrlichia canis" ~ "Ehrlichia canis",
                                         pathogen == "Campylobacter jejuni" ~ "Campylobacter jejuni",
                                         pathogen == "Campylobacter coli" ~ "Campylobacter coli",
                                         pathogen == "Salmonella abony" ~ "Salmonella enterica subsp. enterica serovar Abony",
                                         pathogen == "Campylobacter lari" ~ "Campylobacter lari",
                                         pathogen == "Giardia cati" ~ "Giardia cati",
                                         pathogen == "Giardia duodenalis" ~ "Giardia intestinalis",
                                         pathogen == "Cryptosporidium rivolta" ~ "Isospora rivolta",
                                         pathogen == "Cryptosporidium felis" ~ "Cryptosporidium felis",
                                         pathogen == "Listeria monocytogenes" ~ "Listeria monocytogenes",
                                         pathogen == "Salmonella derby" ~ "Salmonella enterica subsp. enterica serovar Derby",
                                         pathogen == "Bartonella taylorii" ~ "Bartonella taylorii",
                                         pathogen == "Bartonella grahamii" ~ "Bartonella grahamii",
                                         pathogen == "Bartonella birtlesii" ~ "Bartonella birtlesii",
                                         pathogen == "Bartonella doshiae" ~ "Bartonella doshiae",
                                         pathogen == "Salmonella Hhdar" ~ "Salmonella enterica subsp. enterica serovar Hadar",
                                         pathogen == "Salmonella montevideo" ~ "Salmonella enterica subsp. enterica serovar Montevideo",
                                         pathogen == "Salmonella saintpaul" ~ "Salmonella enterica subsp. enterica serovar Saintpaul",
                                         pathogen == "Salmonella senftenberg" ~ "Salmonella enterica subsp. enterica serovar Senftenberg",
                                         pathogen == "Salmonella newport" ~ "Salmonella enterica subsp. enterica serovar Newport",
                                         pathogen == "Salmonella blockley " ~ "Salmonella enterica subsp. enterica serovar Blockley",
                                         pathogen == "Toxocara cati" ~ "Toxocara cati",
                                         pathogen == "Rickettsia honei" ~ "Rickettsia honei",
                                         pathogen == "Brucella  microti" ~ "Brucella microti",
                                         pathogen == "Salmonella hadar" ~ "Salmonella enterica subsp. enterica serovar Hadar",
                                         pathogen == "Salmonella virchow" ~ "Salmonella enterica subsp. enterica serovar Virchow",
                                         pathogen == "Salmonella thompson" ~ "Salmonella enterica subsp. enterica serovar Thompson",
                                         pathogen == "Salmonella agona" ~ "Salmonella enterica subsp. enterica serovar Agona",
                                         pathogen == "Salmonella kentucky" ~ "Salmonella enterica subsp. enterica serovar Kentucky",
                                         pathogen == "Salmonella braenderup" ~ "Salmonella enterica subsp. enterica serovar Braenderup",
                                         pathogen == "Salmonella indiana" ~ "Salmonella enterica subsp. enterica serovar Indiana", 
                                         pathogen == "Leishmania infantum" ~ "Leishmania infantum",
                                         pathogen == "Salmonella brenderup" ~ "Salmonella enterica subsp. enterica serovar Braenderup",
                                         pathogen == "Salmonella livingstone" ~ "Salmonella enterica subsp. enterica serovar Livingstone",
                                         pathogen == "Echinococcus multilocularis" ~ "Echinococcus multilocularis",
                                         pathogen == "Echinococcus granulosus" ~ "Echinococcus granulosus",
                                         pathogen == "Salmonella enterica" ~ "Salmonella enterica",
                                         pathogen == "Leptospira interrogans" ~ "Leptospira interrogans",
                                         pathogen == "Campylobacter hyointestinalis" ~ "Campylobacter hyointestinalis",
                                         pathogen == "Brucella suis biovar 2" ~ "Brucella suis bv. 2",
                                         pathogen == "Brucella suis biovar 3" ~ "Brucella", # B. suis biovar 3 is not found in NCBI (melitensis or abortus?)
                                         pathogen == "Tick-borne encephalitis virus" ~ "Tick-borne encephalitis virus",
                                         pathogen == "Salmonella enteridis" ~ "Salmonella enterica subsp. enterica serovar Enteritidis",
                                         pathogen == "STEC Escherichia coli" ~ "Escherichia coli STEC",
                                         pathogen == "Yersinia enterocolitica" ~ "Salmonella enterica subsp. enterica",
                                         pathogen == "Salmonella houtenae" ~ "Salmonella enterica subsp. houtenae",
                                         pathogen == "Salmonella salamae" ~ "Salmonella enterica subsp. salamae",
                                         pathogen == "Salmonella diarizonae" ~ "Salmonella enterica subsp. diarizonae",
                                         pathogen == "Salmonella Paratyphi B variant Java" ~ "Salmonella enterica subsp. enterica serovar Paratyphi B",
                                         pathogen == "Salmonella Gatuni" ~ "Salmonella enterica subsp. enterica serovar Gatuni",
                                         pathogen == "Campylobacter upsaliensis" ~ "Campylobacter upsaliensis",
                                         pathogen == "Leptospira australis" ~ "Leptospira",
                                         pathogen == "Leptospira canicola" ~ "Leptospira interrogans serovar Canicola",
                                         pathogen == "Leptospira copenhageni" ~ "Leptospira interrogans serovar Copenhageni",
                                         pathogen == "Leptospira hardjo" ~ "Leptospira interrogans serovar Hardjo",
                                         pathogen == "Leptospira pomona" ~ "Leptospira", 
                                         pathogen == "Leptospira saxkoebing" ~ "Leptospira",  # can be 2 species
                                         pathogen == "Leptospira tarassovi" ~ "Leptospira",# can be 2 species
                                         pathogen == "Trichinella spiralis" ~ "Trichinella spiralis",
                                         pathogen == "Toxoascaris leonia" ~ "Toxoascaris leonina",
                                         pathogen == "Tick borne enchephalitis virus" ~ "Tick-borne encephalitis virus",
                                         pathogen == "Borrelia afzelii" ~ "Borreliella afzelii",
                                         pathogen == "Borrelia miyamotoi" ~ "Borrelia miyamotoi",
                                         pathogen == "Borrelia garinii" ~ "Borreliella garinii",
                                         pathogen == "Borrelia lustinaniae" ~ "Borreliella lusitaniae",
                                         pathogen == "Borrelia valesiana" ~ "Borreliella valaisiana",
                                         pathogen == "Toxoascaris leonia" ~ "Toxoascaris leonia",
                                         pathogen == "Tula virus" ~ "Orthohantavirus tulaense",
                                         pathogen == "Parainfluenza virus type 4" ~ "Parainfluenza virus 4 ", 
                                         pathogen == "Ancylostoma tubaeformae" ~ "Ancylostoma tubaeformae",
                                         pathogen == "Puumala virus" ~ "Orthohantavirus puumalaense",
                                         pathogen == "Uncinaria stenocephala" ~ "Uncinaria stenocephala",
                                         pathogen == "Encephalitozoon cuniculi" ~ "Encephalitozoon cuniculi",
                                         pathogen == "Baylisascaris columnaris" ~ "Baylisascaris columnaris",
                                         pathogen == " Eschericha coli" ~ "Escherichia coli",
                                         pathogen == "Borellia spielmanii" ~ "Borreliella spielmanii",
                                         pathogen == "Borrelia lusitaniae" ~ "Borreliella lusitaniae",
                                         pathogen == "Borrelia valaisiana" ~ "Borreliella valaisiana",
                                         TRUE ~ matched_name)) %>%
  dplyr::rename("ncbi_pathog" = "matched_name")


# Distinct (pathogen, ncbi_pathog) pairs for which no NCBI-resolved name is
# available yet; printed so they can be completed by hand.
to_complete_pathog <- zoon.df_7 %>%
  dplyr::filter(is.na(ncbi_pathog)) %>%
  dplyr::select(pathogen, ncbi_pathog) %>%
  dplyr::distinct()
to_complete_pathog


# Retrieve the taxonomic hierarchy for a given taxon - pathogen
# The lookup is done once per distinct resolved name. The "Choose ..." notes
# below record the answers given when the lookup offered several NCBI matches
# for a name (presumably taxize's interactive prompt - confirm when re-running).
classif_pathog <- classification(unique(zoon.df_7$ncbi_pathog), db = "ncbi")

## Coronavirus: Choose NA because it is unspecified in the paper
## Choose any for coronavirus since the taxonomy is the same
## Choose 2: genus kinetoplastids     Leishmania
## Choose 2: Yersinia
## Choose 2: Proteus


# Stack the per-name hierarchies into one long table (one row per name and
# rank) and keep only the ranks that are added to the dataset.
classif_pathog.long <- rbind(classif_pathog) %>%
  dplyr::select(-id) %>%
  dplyr::filter(rank %in% c("superkingdom", "order", "family", "genus"))

# Reshape to one row per queried name with one column per rank.
# pivot_wider() replaces the superseded spread(); the explicit tidyr:: prefix
# means the call does not depend on tidyr being attached.
classif_pathog.df <- classif_pathog.long %>%
  tidyr::pivot_wider(names_from = rank, values_from = name) %>%
  dplyr::select(query, superkingdom, order, family, genus)

# Attach the pathogen taxonomy to the data (matched on the resolved pathogen
# name) and give the rank columns a "_pathog" suffix.
zoon.df_8 <- zoon.df_7 %>%
  left_join(classif_pathog.df, by = c("ncbi_pathog" = "query")) %>%
  dplyr::rename(superkingdom_pathog = superkingdom,
                order_pathog = order,
                family_pathog = family,
                genus_pathog = genus)

# Save this intermediate table, then reload it from disk
# (presumably so later steps can restart here without repeating the lookups).
write.csv(zoon.df_8, file = "zoon.df_8.csv", row.names = FALSE)

zoon.df_8 <- read.csv(file = "zoon.df_8.csv",
                      header = TRUE,
                      encoding = "latin-9")

## Check that superkingdom == type_pathog
# The NCBI superkingdom is lower-cased and "viruses" is mapped to "virus"
# before being compared with the type recorded during data collection.
# The case_when() needs an explicit default: without it every non-virus
# superkingdom becomes NA, and filter() silently drops rows whose comparison
# is NA, so mismatches for bacteria/eukaryotes could never be reported.
# NOTE(review): if type_pathog uses labels that differ from the lower-cased
# NCBI superkingdom (other than "virus"), add the corresponding mappings here.
check_pathog_type <- zoon.df_8 %>%
  mutate(superkingdom_pathog = tolower(superkingdom_pathog)) %>% #lower case
  mutate(superkingdom_pathog = case_when(superkingdom_pathog == "viruses" ~ "virus",
                                         TRUE ~ superkingdom_pathog)) %>%
  dplyr::filter(superkingdom_pathog != type_pathog) %>%
  dplyr::select(type_pathog, pathogen_1, ncbi_pathog, superkingdom_pathog)
check_pathog_type # every row listed here is a mismatch to review; ideally an empty table

# List the records that have a resolved pathogen name but for which no
# taxonomic order was joined in
zoon.df_8_path_tax <- zoon.df_8 %>%
  dplyr::select(type_pathog, pathogen_1, ncbi_pathog,
                order_pathog, family_pathog, genus_pathog) %>%
  dplyr::filter(!is.na(ncbi_pathog), is.na(order_pathog))
zoon.df_8_path_tax

# Encephalitozoon will be given the suborder (no order retrieved from the NCBI database)
# Escherichia coli STEC, Escherichia coli VTEC, Escherichia coli EHEC, Escherichia coli EPEC --> E. coli
# Staphylococcus aureus MRSA --> Staphylococcus aureus

#classif_e.coli <- classification(unique("Escherichia coli"), db = "ncbi") 
#classif_s.aureus <- classification(unique("Staphylococcus aureus"), db = "ncbi") 
#classif_Penta <- classification(unique("Pentastomida"), db = "ncbi") 
#classif_Herpesvirus <- classification(unique("Herpesvirus"), db = "ncbi") 
#classif_Cheyletiella <- classification(unique("Cheyletiella"), db = "ncbi") # I have searched in https://animaldiversity.org/accounts/Cheyletiella/classification/

# Manually complete the taxonomy of pathogens for which the NCBI lookup
# returned nothing.
# - E. coli pathotypes and MRSA get the taxonomy of their species.
# - Encephalitozoon gets its suborder in place of the order.
# - Any other resolved name still lacking order/family/genus is set to "unknown".
# - Superkingdom is filled in for a fixed list of names; others are left as is.

# E. coli pathotype labels that share the taxonomy of Escherichia coli
ecoli_pathotypes <- c("Escherichia coli STEC", "Escherichia coli VTEC",
                      "Escherichia coli EHEC", "Escherichia coli EPEC")

zoon.df_8.0 <- zoon.df_8 %>%
  # Strip stray leading/trailing whitespace from the resolved names. The name
  # assigned upstream for parainfluenza is "Parainfluenza virus 4 " (trailing
  # space), which would otherwise never equal "Parainfluenza virus 4" below.
  dplyr::mutate(ncbi_pathog = str_trim(ncbi_pathog)) %>%
  dplyr::mutate(order_pathog = case_when(
    ncbi_pathog %in% c("Encephalitozoon cuniculi", "Encephalitozoon") ~ "Apansporoblastina",
    ncbi_pathog %in% ecoli_pathotypes ~ "Enterobacterales",
    ncbi_pathog == "Staphylococcus aureus MRSA" ~ "Bacillales",
    ncbi_pathog == "Cheyletiella" ~ "Prostigmata",
    !is.na(ncbi_pathog) & is.na(order_pathog) ~ "unknown",
    TRUE ~ order_pathog)) %>%
  dplyr::mutate(family_pathog = case_when(
    ncbi_pathog %in% ecoli_pathotypes ~ "Enterobacteriaceae",
    ncbi_pathog == "Staphylococcus aureus MRSA" ~ "Staphylococcaceae",
    ncbi_pathog == "Cheyletiella" ~ "Cheyletidae",
    !is.na(ncbi_pathog) & is.na(family_pathog) ~ "unknown",
    TRUE ~ family_pathog)) %>%
  dplyr::mutate(genus_pathog = case_when(
    ncbi_pathog %in% ecoli_pathotypes ~ "Escherichia",
    ncbi_pathog == "Staphylococcus aureus MRSA" ~ "Staphylococcus",
    ncbi_pathog == "Cheyletiella" ~ "Cheyletiella",
    !is.na(ncbi_pathog) & is.na(genus_pathog) ~ "unknown",
    TRUE ~ genus_pathog)) %>%
  # The three name lists below are disjoint, so their order does not matter.
  dplyr::mutate(superkingdom_pathog = case_when(
    ncbi_pathog %in% c(ecoli_pathotypes,
                       "Escherichia coli ESBL",
                       "Staphylococcus aureus MRSA",
                       "Mycobacterium chelonae") ~ "Bacteria",
    ncbi_pathog %in% c("Cheyletiella", "Ancylostoma tubaeformae") ~ "Eukaryota",
    ncbi_pathog %in% c("Coronavirus", "Parainfluenza virus 4", "Herpesvirus") ~ "Viruses",
    TRUE ~ superkingdom_pathog))

# Save this intermediate table, then reload it from disk
write.csv(zoon.df_8.0, "zoon.df_8.0.csv", row.names = FALSE)

zoon.df_8.0 <- read.csv("zoon.df_8.0.csv",
                        header = TRUE, encoding="latin-9")


################# Add NCBI-resolved vector names and taxonomy (order, Family) to the dataset ###################

# List of vector names
list_vector_sci_name <- unique(zoon.df$sci_name_vector_1)

# Resolve the names, restricted to data source "4"
# (presumably NCBI, as the section heading suggests - confirm against the
# resolver's list of data sources)
gnr_res_vect_sci_name <- gnr_resolve(sci = list_vector_sci_name, preferred_data_sources = "4")

# Keep a single match per supplied name: the highest score, and the first row
# when several matches tie. The result is ungrouped so that the grouping does
# not leak into later operations on this table.
gnr_res_vector_subset <- gnr_res_vect_sci_name %>%
  dplyr::group_by(user_supplied_name) %>%
  dplyr::filter(score == max(score)) %>%  # keep highest score
  dplyr::slice_head(n = 1) %>%            # keep first value
  dplyr::ungroup()

# Add the resolved vector name to each record (matched on the vector name
# taken from the publication), drop the resolver's bookkeeping columns and
# store the match as "ncbi_vector".
zoon.df_9 <- left_join(zoon.df_8.0, gnr_res_vector_subset,
                       by = c("sci_name_vector_1" = "user_supplied_name")) %>%
  dplyr::select(-submitted_name, -data_source_title, -score) %>%
  dplyr::rename(ncbi_vector = matched_name)

# Save this intermediate table, then reload it from disk
write.csv(zoon.df_9, file = "zoon.df_9.csv", row.names = FALSE)

zoon.df_9 <- read.csv(file = "zoon.df_9.csv",
                      header = TRUE,
                      encoding = "latin-9")

# List the distinct (publication name, resolved name) pairs for which no
# resolved vector name is available (ncbi_vector is NA)
zoon.df_9_diff <- zoon.df_9 %>%
  dplyr::select(sci_name_vector_1, ncbi_vector) %>%
  dplyr::filter(is.na(ncbi_vector)) %>%
  dplyr::distinct()
zoon.df_9_diff # None!

# Retrieve the taxonomic hierarchy for a given taxon - vector
# The lookup is done once per distinct resolved vector name.
classif_vector <- classification(unique(zoon.df_9$ncbi_vector), db = "ncbi")
# choose genus over subspecies or subgenus

# Stack the per-name hierarchies into one long table (one row per name and
# rank) and keep only the ranks that are added to the dataset.
classif_vector.long <- rbind(classif_vector) %>%
  dplyr::select(-id) %>%
  dplyr::filter(rank %in% c("order", "family", "genus"))

# Reshape to one row per queried name with one column per rank.
# pivot_wider() replaces the superseded spread(); the explicit tidyr:: prefix
# means the call does not depend on tidyr being attached.
classif_vector.df <- classif_vector.long %>%
  tidyr::pivot_wider(names_from = rank, values_from = name) %>%
  dplyr::select(query, order, family, genus)

# Attach the vector taxonomy to the data (matched on the resolved vector name)
# and give the rank columns a "_vector" suffix.
zoon.df_10 <- zoon.df_9 %>%
  left_join(classif_vector.df, by = c("ncbi_vector" = "query")) %>%
  dplyr::rename(order_vector = order,
                family_vector = family,
                genus_vector = genus)

# Search vector which did not get taxonomy
# (records with a resolved vector name but no taxonomic order joined in)
zoon.df_10_vec_tax <- zoon.df_10 %>%
  dplyr::filter(!is.na(ncbi_vector) & is.na(order_vector)) %>%
  dplyr::select(type_vector, sci_name_vector_1, ncbi_vector, order_vector, family_vector, genus_vector)
zoon.df_10_vec_tax # display the table, as is done for the other checks in this script

classification("Gastropoda", db = "ncbi") 
# Gastropoda is higher taxonomy than order. We cannot retrieve order from the information source.

# Few more corrections
# Three bunyaviruses are given their current species name and their taxonomy
# by hand, and Toxoascaris leonina is given its superkingdom.
# The resolved name is corrected first and every taxonomy override (including
# superkingdom) is then keyed on it. This way all rows carrying one of these
# resolved names are treated alike, whichever spelling the publication used,
# instead of receiving order/family/genus but not superkingdom.
zoon.df_10.0 <- zoon.df_10 %>%
  dplyr::mutate(ncbi_pathog = case_when(pathogen == "Tahyna orthobunyavirus" ~ "Orthobunyavirus tahynaense",
                                        pathogen == "Crimean-Congo hemorrhagic fever orthonairovirus" ~ "Orthonairovirus haemorrhagiae",
                                        pathogen == "Tula orthohantavirus" ~ "Orthohantavirus tulaense",
                                        TRUE ~ ncbi_pathog)) %>%
  dplyr::mutate(superkingdom_pathog = case_when(ncbi_pathog %in% c("Orthobunyavirus tahynaense",
                                                                   "Orthonairovirus haemorrhagiae",
                                                                   "Orthohantavirus tulaense") ~ "Viruses",
                                                ncbi_pathog == "Toxoascaris leonina" ~ "Eukaryota",
                                                TRUE ~ superkingdom_pathog)) %>%
  dplyr::mutate(order_pathog = case_when(ncbi_pathog == "Orthobunyavirus tahynaense" ~ "Bunyavirales",
                                         ncbi_pathog == "Orthonairovirus haemorrhagiae" ~ "Bunyavirales",
                                         ncbi_pathog == "Orthohantavirus tulaense" ~ "Bunyavirales",
                                         TRUE ~ order_pathog)) %>%
  dplyr::mutate(family_pathog = case_when(ncbi_pathog == "Orthobunyavirus tahynaense" ~ "Peribunyaviridae",
                                          ncbi_pathog == "Orthonairovirus haemorrhagiae" ~ "Nairoviridae",
                                          ncbi_pathog == "Orthohantavirus tulaense" ~ "Hantaviridae",
                                          TRUE ~ family_pathog)) %>%
  dplyr::mutate(genus_pathog = case_when(ncbi_pathog == "Orthobunyavirus tahynaense" ~ "Orthobunyavirus",
                                         ncbi_pathog == "Orthonairovirus haemorrhagiae" ~ "Orthonairovirus",
                                         ncbi_pathog == "Orthohantavirus tulaense" ~ "Orthohantavirus",
                                         TRUE ~ genus_pathog))


################################ Export the final table ################################

# Write the fully cleaned and annotated dataset to disk, without row names
write.csv(zoon.df_10.0, file = "table_final.csv", row.names = FALSE)

